{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import ast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "df=pd.read_csv(r'C:\\Users\\Yasaman\\Arab_spring_scholarly_attention\\Validating\\final_annotated.csv')\n",
    "df2=pd.read_excel(r\"C:\\Users\\Yasaman\\Downloads\\samples_1127.xlsx\")\n",
    "Map={row['ID']:row['Mentions'] for i,row in df2.iterrows()}\n",
    "df.loc[(df['SampleGroup'] == 'with_mention_arab') & (df['Mentions'].isna()), 'Mentions'] = df.loc[(df['SampleGroup'] == 'with_mention_arab') & (df['Mentions'].isna()), 'ID'].map(Map)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['union_annotation'] = df['union_annotation'].apply(ast.literal_eval)\n",
    "df['intersection_annotation'] = df['intersection_annotation'].apply(ast.literal_eval)\n",
    "df['Mentions'] = df['Mentions'].apply(ast.literal_eval)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "annotator_agreement\n",
       "True     891\n",
       "False    109\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['annotator_agreement'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Yasaman\\AppData\\Local\\Temp\\ipykernel_21972\\3424697931.py:1: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "  sample_accuracies = df.groupby(\"SampleGroup\").apply(\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SampleGroup</th>\n",
       "      <th>accuracy_union</th>\n",
       "      <th>accuracy_intersection</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>with_mention_arab</td>\n",
       "      <td>0.840000</td>\n",
       "      <td>0.750000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>with_mention</td>\n",
       "      <td>0.928571</td>\n",
       "      <td>0.775000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>field_20</td>\n",
       "      <td>0.975000</td>\n",
       "      <td>0.959615</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         SampleGroup  accuracy_union  accuracy_intersection\n",
       "0  with_mention_arab        0.840000               0.750000\n",
       "1       with_mention        0.928571               0.775000\n",
       "2           field_20        0.975000               0.959615"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_accuracies = df.groupby(\"SampleGroup\").apply(\n",
    "    lambda x: pd.Series({\n",
    "        \"accuracy_union\": (x.apply(lambda row: set(row['Mentions']) == set(row['union_annotation']), axis=1).mean()),\n",
    "        \"accuracy_intersection\": (x.apply(lambda row: set(row['Mentions']) == set(row['intersection_annotation']), axis=1).mean())\n",
    "    })\n",
    ").reset_index()\n",
    "\n",
    "sample_accuracies = sample_accuracies.sort_values(by='accuracy_union').reset_index(drop=True)\n",
    "sample_accuracies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['Mentions']=df[\"Mentions\"].apply(lambda x: sorted(x))\n",
    "df['union_annotation']=df[\"union_annotation\"].apply(lambda x: sorted(x))\n",
    "df['intersection_annotation']=df[\"intersection_annotation\"].apply(lambda x: sorted(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Yasaman\\AppData\\Local\\Temp\\ipykernel_21972\\292121645.py:1: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "  sample_accuracies = df.groupby(\"SampleGroup\").apply(\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SampleGroup</th>\n",
       "      <th>jaccard_union</th>\n",
       "      <th>jaccard_intersection</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>with_mention_arab</td>\n",
       "      <td>0.905411</td>\n",
       "      <td>0.820729</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>with_mention</td>\n",
       "      <td>0.946131</td>\n",
       "      <td>0.804048</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>field_20</td>\n",
       "      <td>0.976923</td>\n",
       "      <td>0.960577</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         SampleGroup  jaccard_union  jaccard_intersection\n",
       "0  with_mention_arab       0.905411              0.820729\n",
       "1       with_mention       0.946131              0.804048\n",
       "2           field_20       0.976923              0.960577"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_accuracies = df.groupby(\"SampleGroup\").apply(\n",
    "    lambda x: pd.Series({\n",
    "        \"jaccard_union\": (\n",
    "            x.apply(\n",
    "                lambda row: len(set(row['Mentions']).intersection(set(row['union_annotation']))) /\n",
    "                            len(set(row['Mentions']).union(set(row['union_annotation'])))\n",
    "                if len(set(row['Mentions']).union(set(row['union_annotation']))) > 0 else 1,\n",
    "                axis=1\n",
    "            ).mean()\n",
    "        ),\n",
    "        \"jaccard_intersection\": (\n",
    "            x.apply(\n",
    "                lambda row: len(set(row['Mentions']).intersection(set(row['intersection_annotation']))) /\n",
    "                            len(set(row['Mentions']).union(set(row['intersection_annotation'])))\n",
    "                if len(set(row['Mentions']).union(set(row['intersection_annotation']))) > 0 else 1,\n",
    "                axis=1\n",
    "            ).mean()\n",
    "        )\n",
    "    })\n",
    ").reset_index()\n",
    "\n",
    "sample_accuracies = sample_accuracies.sort_values(by='jaccard_union').reset_index(drop=True)\n",
    "sample_accuracies"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
